zinbModelFromW <- function(counts, sim_W){
mod <- model.matrix(~ sim_W)
zinb_sim <- zinbFit(counts, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1:3,
commondispersion=FALSE)
true_alpha_mu <- zinb_sim@beta_mu[-1,]
true_alpha_pi <- zinb_sim@beta_pi[-1,]
true_beta_mu <- zinb_sim@beta_mu[1,,drop=FALSE]
true_beta_pi <- zinb_sim@beta_pi[1,,drop=FALSE]
true_gamma_mu <- zinb_sim@gamma_mu
true_gamma_pi <- zinb_sim@gamma_pi
true_zeta <- zinb_sim@zeta
zinbModel(W=sim_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi,
beta_mu=true_beta_mu, beta_pi=true_beta_pi,
zeta = true_zeta, gamma_mu=true_gamma_mu,
gamma_pi=true_gamma_pi)
}
simSummary = data.frame(realData = rep('Allen', 3),
K = rep(2, 3),
nclusters = rep(3, 3),
corMuPi = rep('no', 3),
dim = rep('1000x100', 3),
zeroFract = c(.25, .5, .75),
nreplicates = 10)
kable(simSummary)
| realData | K | nclusters | corMuPi | dim | zeroFract | nreplicates |
|---|---|---|---|---|---|---|
| Allen | 2 | 3 | no | 1000x100 | 0.25 | 10 |
| Allen | 2 | 3 | no | 1000x100 | 0.50 | 10 |
| Allen | 2 | 3 | no | 1000x100 | 0.75 | 10 |
To simulate a dataset, steps are as follow
Using this procedure, we have the simulated counts and true \(W\), \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\).
To simulate the three datasets from Allen real dataset, the same 100 cells were used.
data("allen")
prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
which(colData(allen)$Core.Type=="Core")]
sampleCells <- sample(ncol(prefilter), 200)
postfilter <- prefilter[, sampleCells]
filterGenes <- apply(assay(postfilter) > 5, 1, sum) >= 5
postfilter <- postfilter[filterGenes, ]
bio = bioIni = as.factor(colData(postfilter)$driver_1_s)
cl <- as.factor(colData(postfilter)$Primary.Type)
postfilter <- assay(postfilter)
vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
pZero <- rowMeans(log1p(postfilter) == 0)
names(pZero) <- rownames(postfilter)
pZero = pZero[names(vars)]
chosenGenes = c(names(pZero)[pZero > 0.75][1:100],
names(pZero)[pZero<0.75 & pZero>0.6][1:400],
names(pZero)[pZero<0.6 & pZero>0.2][1:400],
names(pZero)[pZero<0.2][1:100])
core <- postfilter[chosenGenes, ]
#sum(core == 0)/(nrow(core) * ncol(core))
The dimensions are
dim(core)
## [1] 1000 200
par(mfrow = c(1,2))
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, 200 cells")
#legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, 200 cells")
Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
commondispersion = FALSE)))
## user system elapsed
## 378.820 214.884 244.504
100 cells are selected. Now, the dimensions are
estW = zinb@W
filt1 = estW[,1] > -1 & bio == 'Rbp4-Cre_KL100'
estW = estW[!filt1, ]
bio = bio[!filt1]
core = core[,!filt1]
filt2 = estW[,2] < 0 & bio == 'Scnn1a-Tg3-Cre'
estW = estW[!filt2, ]
bio = bio[!filt2]
core = core[,!filt2]
chosen = sample(nrow(estW), 100, replace = F)
bio = bio[chosen]
estW = estW[chosen, ]
core5 = core[,chosen]
dim(core5)
## [1] 1000 100
chosenCells = colnames(core5)
detection_rate <- colSums(core5>0)
coverage <- colSums(core5)
print(system.time(zinb5 <- zinbFit(core5, ncores = 3, K = 2,
commondispersion = FALSE)))
## user system elapsed
## 233.583 125.569 142.735
par(mfrow = c(1,3))
plot(zinb@W, col=cols[bioIni], pch=19, xlab="W1", ylab="W2",
main="Fitted W, 200 cells")
plot(estW, col=cols[bio], pch=19, xlab="W1", ylab="W2",
main="Fitted W, 100 cells")
plot(zinb5@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
main="Fitted W, 100 cells, refit")
zinb = zinb5
W = data.frame(W1 = zinb@W[,1], W2 = zinb@W[,2], bio = bio)
ggplot(melt(W), aes(value, fill = bio)) + geom_density(alpha = 0.2) +
facet_grid(~ variable) + xlab('fitted W') + theme_bw()
W simulated with \[W_1^{Scnn1a} \sim \mathcal{N}(1, .3)\] \[W_2^{Scnn1a} \sim \mathcal{N}(3, .3)\] \[W_1^{Rbp4} \sim \mathcal{N}(2, .3)\] \[W_2^{Rbp4} \sim \mathcal{N}(-3, .3)\] \[W_1^{Ntsr1} \sim \mathcal{N}(-4, .4)\] \[W_2^{Ntsr1} \sim \mathcal{N}(0, .4)\] TODO: change numeric values.
Sc = grepl('^Scnn1a', W$bio)
Rb = grepl('^Rbp4', W$bio)
Nt = grepl('^Ntsr1', W$bio)
W[Sc, 'W1sim'] = rnorm(sum(Sc), 1, .3)
W[Sc, 'W2sim'] = rnorm(sum(Sc), 3, .3)
W[Rb, 'W1sim'] = rnorm(sum(Rb), 2, .3)
W[Rb, 'W2sim'] = rnorm(sum(Rb), -3, .3)
W[Nt, 'W1sim'] = rnorm(sum(Nt), -4, .4)
W[Nt, 'W2sim'] = rnorm(sum(Nt), 0, .4)
par(mfrow=c(1, 2))
plot(zinb@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
main="Fitted")
plot(W[,c('W1sim', 'W2sim')], col=cols[bio], pch=19, xlab="W1", ylab="W2",
main="Simulated")
sim_W <- as.matrix(W[, c('W1sim', 'W2sim')])
simModel = simModel5 = zinbModelFromW(core5, sim_W)
simData = simData5 = zinbSim(simModel, seed = 8746)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
gamma_pi = simModel@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.2777558 0.3361947 0.8805161
## gamma_pi -0.2777558 1.0000000 -0.9845389 -0.5059706
## detection_rate 0.3361947 -0.9845389 1.0000000 0.5370054
## coverage 0.8805161 -0.5059706 0.5370054 1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
alpha_mu_2 = simModel@alpha_mu[2, ],
alpha_pi_1 = simModel@alpha_pi[1, ],
alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.00000000 0.1434140 -0.28012445 -0.03256204
## alpha_mu_2 0.14341402 1.0000000 -0.12087510 -0.39915616
## alpha_pi_1 -0.28012445 -0.1208751 1.00000000 0.03098031
## alpha_pi_2 -0.03256204 -0.3991562 0.03098031 1.00000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.4648914
## beta_pi -0.4648914 1.0000000
chosenGenes = c(names(pZero)[pZero > 0.4][1:50],
names(pZero)[pZero<0.4 & pZero>0.2][1:550],
names(pZero)[pZero<0.2][1:400])
core25 <- postfilter[chosenGenes, chosenCells]
sum(core25 == 0)/(nrow(core25) * ncol(core25))
## [1] 0.25206
simModel = simModel25 = zinbModelFromW(core25, sim_W)
simData = simData25 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core25>0)
coverage <- colSums(core25)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
gamma_pi = simModel@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.3790699 0.4263121 0.9053465
## gamma_pi -0.3790699 1.0000000 -0.9863453 -0.4990699
## detection_rate 0.4263121 -0.9863453 1.0000000 0.5474675
## coverage 0.9053465 -0.4990699 0.5474675 1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
alpha_mu_2 = simModel@alpha_mu[2, ],
alpha_pi_1 = simModel@alpha_pi[1, ],
alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.000000000 0.0751755 -0.30550364 0.007498819
## alpha_mu_2 0.075175503 1.0000000 -0.10965711 -0.342199866
## alpha_pi_1 -0.305503638 -0.1096571 1.00000000 -0.063618412
## alpha_pi_2 0.007498819 -0.3421999 -0.06361841 1.000000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.1039886
## beta_pi -0.1039886 1.0000000
chosenGenes = c(names(pZero)[pZero > 0.8][1:800],
names(pZero)[pZero < 0.8 & pZero > 0.3][1:100],
names(pZero)[pZero < 0.3][1:100])
core75 <- postfilter[chosenGenes, chosenCells]
sum(core75 == 0)/(nrow(core75) * ncol(core75))
## [1] 0.74562
simModel = simModel75 = zinbModelFromW(core75, sim_W)
simData = simData75 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core75>0)
coverage <- colSums(core75)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
gamma_pi = simModel@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.2323312 0.3260281 0.8849565
## gamma_pi -0.2323312 1.0000000 -0.9801307 -0.5076748
## detection_rate 0.3260281 -0.9801307 1.0000000 0.5622813
## coverage 0.8849565 -0.5076748 0.5622813 1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
alpha_mu_2 = simModel@alpha_mu[2, ],
alpha_pi_1 = simModel@alpha_pi[1, ],
alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.0000000000 0.16801277 0.10911997 0.0006377646
## alpha_mu_2 0.1680127680 1.00000000 -0.03405831 0.0453612814
## alpha_pi_1 0.1091199731 -0.03405831 1.00000000 0.0806254646
## alpha_pi_2 0.0006377646 0.04536128 0.08062546 1.0000000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.00000000 -0.02579542
## beta_pi -0.02579542 1.00000000
core = core5
zinb = zinb5
obs_detection_rate <- colMeans(core>0)
obs_coverage <- colSums(core)
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb)), obs_detection_rate,
xlab="Average fitted Pi", ylab="Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(obs_coverage), xlim = c(4, 7),
xlab="Average fitted log Mu", ylab="log Coverage", pch=19, ylim = c(9,14),
col=cols[bio])
core = core5
simData = simData5
simModel = simModel5
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)
par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
xlab="Average simulated Pi", ylab="True Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=cols[bio],ylim = c(9,14))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
core = core25
simData = simData25
simModel = simModel25
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)
par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
xlab="Average simulated Pi", ylab="True Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=cols[bio],ylim = c(9,14))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
simData = simData75
simModel = simModel75
core = core75
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)
par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
xlab="Average simulated Pi", ylab="True Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=cols[bio],ylim = c(9,14))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
B = 10
bio = as.vector(bio)
originalCounts = core5
simModel = simModel5
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen5.rda")
zero5 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core25
simModel = simModel25
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen25.rda")
zero25 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core75
simModel = simModel75
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen75.rda")
zero75 = sapply(simData, function(x) x$zeroFraction)
zeroFrac = data.frame(zero = c(zero5, zero25, zero75),
perc = c(rep(.5, 10), rep(.25, 10), rep(.75, 10)))
zeroFrac$perc = factor(zeroFrac$perc)
zeroFrac = melt(zeroFrac)
ggplot(zeroFrac, aes(x = perc, y = value)) + geom_boxplot() +
ylab('zero fraction') + xlab('wanted zero fraction') + ggtitle('one boxplot = 10 replicates')